
#include <sys/cdefs.h>
#include <cstdio>
#include <cstdlib>
typedef double doublev4 __attribute__((mode(V4DF)));
template<int IDZ>
int const_abs(){
  return (IDZ < 0? -IDZ: IDZ);
}
__always_inline void print_doublev4(doublev4 v){
  double *a = (double*)&v;
  printf("%f %f %f %f\n", a[0], a[1], a[2], a[3]);
}
template<int NDZ, int IDZ, int END>
struct unrolled_loop{
  
  __always_inline static void run(doublev4 &e0v4, doublev4 &e1v4, int k, double *q, double *gdirect) {
    // printf("%d %d %d\n", NDZ, IDZ, END);
    // doublev4 e0v4 = __builtin_sw_vldd_f(e + k);
    // doublev4 e1v4 = __builtin_sw_vldd_f(e + k + 4);
    doublev4 q0v4 = __builtin_sw_vldd_u_f(q + k + IDZ);
    doublev4 q4v4 = __builtin_sw_vldd_u_f(q + k + IDZ + 4);
    doublev4 q8v4 = __builtin_sw_vldd_u_f(q + k + IDZ + 8);
    /*[4,5,6,7] [0,1,2,3] -> [2,3,6,7] (1,0,3,2) -> 0x4e*/
    doublev4 q2v4 = __builtin_sw_slave_vshuffled(q4v4, q0v4, 0x4e);
    /*[2,3,4,5] [0,1,2,3] -> [1,2,3,4] (2,1,2,1) -> 0x99*/
    doublev4 q1v4 = __builtin_sw_slave_vshuffled(q2v4, q0v4, 0x99);
    doublev4 q3v4 = __builtin_sw_slave_vshuffled(q4v4, q2v4, 0x99);
    doublev4 q6v4 = __builtin_sw_slave_vshuffled(q8v4, q4v4, 0x4e);
    doublev4 q5v4 = __builtin_sw_slave_vshuffled(q6v4, q4v4, 0x99);
    doublev4 q7v4 = __builtin_sw_slave_vshuffled(q8v4, q6v4, 0x99);
    // doublev4 q1v4 = __builtin_sw_vldd_u_f(q + k + IDZ + 1);
    // doublev4 q2v4 = __builtin_sw_vldd_u_f(q + k + IDZ + 2);
    // doublev4 q3v4 = __builtin_sw_vldd_u_f(q + k + IDZ + 3);
    // print_doublev4(q2v4);
    // print_doublev4(q2v4_shf);
    // print_doublev4(q1v4);
    // print_doublev4(q1v4_shf);
    // print_doublev4(q3v4);
    // print_doublev4(q3v4_shf);
    // exit(0);
    doublev4 gd0v4 = __builtin_sw_ldde(gdirect + const_abs<IDZ>());
    doublev4 gd1v4 = __builtin_sw_ldde(gdirect + const_abs<IDZ+1>());
    doublev4 gd2v4 = __builtin_sw_ldde(gdirect + const_abs<IDZ+2>());
    doublev4 gd3v4 = __builtin_sw_ldde(gdirect + const_abs<IDZ+3>());
    // doublev4 gd4v4 = __builtin_sw_ldde(gdirect + const_abs<IDZ+4>());
    // doublev4 gd5v4 = __builtin_sw_ldde(gdirect + const_abs<IDZ+5>());
    // doublev4 gd6v4 = __builtin_sw_ldde(gdirect + const_abs<IDZ+6>());
    // doublev4 gd7v4 = __builtin_sw_ldde(gdirect + const_abs<IDZ+7>());
    e0v4 += q0v4 * gd0v4 + q1v4 * gd1v4 + q2v4 * gd2v4 + q3v4 * gd3v4;
    e1v4 += q4v4 * gd0v4 + q5v4 * gd1v4 + q6v4 * gd2v4 + q7v4 * gd3v4;
    // __builtin_sw_vstd_f(e + k, e0v4);
    // __builtin_sw_vstd_f(e + k + 4, e1v4);
    unrolled_loop<NDZ, IDZ + 4, (IDZ>=NDZ)>::run(e0v4, e1v4, k, q, gdirect);
  }
  
};
template<int NDZ, int IDZ>
struct unrolled_loop<NDZ, IDZ, 1>{
  
  __always_inline static void run(doublev4 &e0v4, doublev4 &e1v4, int k, double *q, double *gdirect) {
    // puts(__PRETTY_FUNCTION__);
  }
};
const int ndz = 13;
int main(int argc, char **argv){
  int nk = 64;
  if (argc > 1) nk = atoi(argv[1]);
  double e[64], etest[64];
  double q[64 + 32];
  double g[20];
  double *q_off = q + 16;
  for (int i = 0; i < 64; i ++) {
    e[i] = 0;
    etest[i] = 0;
  }
  for (int i = 0; i < 96; i ++) {
    q[i] = i;
  }
  for (int i = 0; i < 20; i ++) {
    g[i] = i;
    if (i > ndz) g[i] = 0;
  }
  asm volatile("":::"memory");
  for (int k = 0; k < nk; k ++) {
    for (int dz = -ndz; dz < 0; dz ++) {
      e[k] += g[-dz] * q_off[k + dz];
    }
    for (int dz = 0; dz <= ndz; dz ++) {
      e[k] += g[dz] * q_off[k + dz];
    }
  }
  long st, ed;
  asm volatile("rcsr %0, 4\n\t":"=r"(st));
  for (int i = 0; i < 100; i ++){
    for (int k = 0; k < nk; k += 8){
      doublev4 e0v4 = __builtin_sw_vldd_f(e + k);
      doublev4 e1v4 = __builtin_sw_vldd_f(e + k + 4);
      unrolled_loop<13, -16, 0>::run(e0v4, e1v4, k, q_off, g);
      __builtin_sw_vstd_f(e + k, e0v4);
      __builtin_sw_vstd_f(e + k + 4, e1v4);
    }
  }
  asm volatile("rcsr %0, 4\n\t":"=r"(ed));
  for (int i = 0; i < nk; i ++){
    printf("%f %f %f\n", e[i], etest[i], e[i]*100 - etest[i]);
  }
  int eff_flops = (ndz * 2 + 1) * nk * 2;
  printf("%ld %ld\n", ed - st, eff_flops);
}